library(knitr)
# install.packages("readr")
library(readr)
library(corrplot)
## corrplot 0.92 loaded
# Karolina's local setup (disabled): CSV-based loading.
# NOTE(review): the two reads below look swapped (app_train3 is read from
# "app_test3.csv" and app_test3 from "app_train3.csv") - confirm before
# re-enabling them.
# setwd("C:/Users/karla/OneDrive/Pulpit/Credit Risk/Projekt")
# app_train3<-read_csv("app_test3.csv")
# app_test3<-read_csv("app_train3.csv")

# Zuzia's local setup (active). The absolute path is machine-specific, so the
# script runs as-is only where this directory exists.
setwd("/Users/zuzanna/Desktop/Studia/CreditRisk/projekt")
# Load the prepared samples. The code below uses the objects app_train3 and
# app_test3, which are presumably restored by these two files - TODO confirm.
load("dane/app_train_prep.RData")
load("dane/app_test_prep.RData")

Introduction

This is an overview of our initially prepared data. We extracted the applicants’ profiles from Kaggle.

The summary of Train Data.

# Per-column summary of the train sample (quartiles for numeric columns,
# length/class for character columns, NA counts where present).
summary(app_train3)
##    SK_ID_CURR         TARGET        NAME_CONTRACT_TYPE CODE_GENDER       
##  Min.   :100002   Min.   :0.00000   Length:307511      Length:307511     
##  1st Qu.:189146   1st Qu.:0.00000   Class :character   Class :character  
##  Median :278202   Median :0.00000   Mode  :character   Mode  :character  
##  Mean   :278180   Mean   :0.08073                                        
##  3rd Qu.:367142   3rd Qu.:0.00000                                        
##  Max.   :456255   Max.   :1.00000                                        
##                                                                          
##  FLAG_OWN_CAR       FLAG_OWN_REALTY     CNT_CHILDREN     AMT_INCOME_TOTAL   
##  Length:307511      Length:307511      Min.   : 0.0000   Min.   :    25650  
##  Class :character   Class :character   1st Qu.: 0.0000   1st Qu.:   112500  
##  Mode  :character   Mode  :character   Median : 0.0000   Median :   147150  
##                                        Mean   : 0.4171   Mean   :   168798  
##                                        3rd Qu.: 1.0000   3rd Qu.:   202500  
##                                        Max.   :19.0000   Max.   :117000000  
##                                                                             
##    AMT_CREDIT       AMT_ANNUITY     AMT_GOODS_PRICE   NAME_TYPE_SUITE   
##  Min.   :  45000   Min.   :  1616   Min.   :  40500   Length:307511     
##  1st Qu.: 270000   1st Qu.: 16524   1st Qu.: 238500   Class :character  
##  Median : 513531   Median : 24903   Median : 450000   Mode  :character  
##  Mean   : 599026   Mean   : 27109   Mean   : 538396                     
##  3rd Qu.: 808650   3rd Qu.: 34596   3rd Qu.: 679500                     
##  Max.   :4050000   Max.   :258026   Max.   :4050000                     
##                    NA's   :12       NA's   :278                         
##  NAME_EDUCATION_TYPE NAME_FAMILY_STATUS DAYS_EMPLOYED    REGION_RATING_CLIENT
##  Length:307511       Length:307511      Min.   :-17912   Min.   :1.000       
##  Class :character    Class :character   1st Qu.: -2760   1st Qu.:2.000       
##  Mode  :character    Mode  :character   Median : -1213   Median :2.000       
##                                         Mean   : 63815   Mean   :2.052       
##                                         3rd Qu.:  -289   3rd Qu.:2.000       
##                                         Max.   :365243   Max.   :3.000       
##                                                                              
##     DOCUMENT        EXT_SOURCE_1     EXT_SOURCE_2     EXT_SOURCE_3  
##  Min.   :0.00000   Min.   :0.01     Min.   :0.0000   Min.   :0.00   
##  1st Qu.:0.05000   1st Qu.:0.33     1st Qu.:0.3925   1st Qu.:0.37   
##  Median :0.05000   Median :0.51     Median :0.5660   Median :0.54   
##  Mean   :0.04651   Mean   :0.50     Mean   :0.5144   Mean   :0.51   
##  3rd Qu.:0.05000   3rd Qu.:0.68     3rd Qu.:0.6636   3rd Qu.:0.67   
##  Max.   :0.20000   Max.   :0.96     Max.   :0.8550   Max.   :0.90   
##                    NA's   :173378   NA's   :660      NA's   :60965  
##  AMT_REQ_CREDIT_BUREAU_QRT
##  Min.   :  0.00           
##  1st Qu.:  0.00           
##  Median :  0.00           
##  Mean   :  0.27           
##  3rd Qu.:  0.00           
##  Max.   :261.00           
##  NA's   :41519



The summary of Test Data.

# Per-column summary of the test sample (quartiles for numeric columns,
# length/class for character columns, NA counts where present).
summary(app_test3)
##    SK_ID_CURR     NAME_CONTRACT_TYPE CODE_GENDER        FLAG_OWN_CAR      
##  Min.   :100001   Length:48744       Length:48744       Length:48744      
##  1st Qu.:188558   Class :character   Class :character   Class :character  
##  Median :277549   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :277797                                                           
##  3rd Qu.:367556                                                           
##  Max.   :456250                                                           
##                                                                           
##  FLAG_OWN_REALTY     CNT_CHILDREN     AMT_INCOME_TOTAL    AMT_CREDIT     
##  Length:48744       Min.   : 0.0000   Min.   :  26942   Min.   :  45000  
##  Class :character   1st Qu.: 0.0000   1st Qu.: 112500   1st Qu.: 260640  
##  Mode  :character   Median : 0.0000   Median : 157500   Median : 450000  
##                     Mean   : 0.3971   Mean   : 178432   Mean   : 516740  
##                     3rd Qu.: 1.0000   3rd Qu.: 225000   3rd Qu.: 675000  
##                     Max.   :20.0000   Max.   :4410000   Max.   :2245500  
##                                                                          
##   AMT_ANNUITY     AMT_GOODS_PRICE   NAME_TYPE_SUITE    NAME_EDUCATION_TYPE
##  Min.   :  2295   Min.   :  45000   Length:48744       Length:48744       
##  1st Qu.: 17973   1st Qu.: 225000   Class :character   Class :character   
##  Median : 26199   Median : 396000   Mode  :character   Mode  :character   
##  Mean   : 29426   Mean   : 462619                                         
##  3rd Qu.: 37390   3rd Qu.: 630000                                         
##  Max.   :180576   Max.   :2245500                                         
##  NA's   :24                                                               
##  NAME_FAMILY_STATUS DAYS_EMPLOYED    REGION_RATING_CLIENT    DOCUMENT      
##  Length:48744       Min.   :-17463   Min.   :1.000        Min.   :0.00000  
##  Class :character   1st Qu.: -2910   1st Qu.:2.000        1st Qu.:0.05000  
##  Mode  :character   Median : -1293   Median :2.000        Median :0.05000  
##                     Mean   : 67485   Mean   :2.038        Mean   :0.04923  
##                     3rd Qu.:  -296   3rd Qu.:2.000        3rd Qu.:0.05000  
##                     Max.   :365243   Max.   :3.000        Max.   :0.05000  
##                                                                            
##   EXT_SOURCE_1    EXT_SOURCE_2       EXT_SOURCE_3   AMT_REQ_CREDIT_BUREAU_QRT
##  Min.   :0.013   Min.   :0.000008   Min.   :0.001   Min.   :0.000            
##  1st Qu.:0.344   1st Qu.:0.408066   1st Qu.:0.364   1st Qu.:0.000            
##  Median :0.507   Median :0.558758   Median :0.519   Median :0.000            
##  Mean   :0.501   Mean   :0.518021   Mean   :0.500   Mean   :0.547            
##  3rd Qu.:0.666   3rd Qu.:0.658497   3rd Qu.:0.653   3rd Qu.:1.000            
##  Max.   :0.939   Max.   :0.855000   Max.   :0.883   Max.   :7.000            
##  NA's   :20532   NA's   :8          NA's   :8668    NA's   :6049



0. TARGET

# Distinct values of the target flag in the train sample.
unique(app_train3$TARGET)
## [1] 1 0
# Coerce the flag to numeric (double) and summarise it; the mean is the share
# of observations with TARGET == 1.
app_train3$TARGET <- as.numeric(app_train3$TARGET)
summary(app_train3$TARGET)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.08073 0.00000 1.00000
# Class counts for the bar chart.
targetTR <- table(app_train3$TARGET)
# Bar labels come from the table itself, so they always match the classes
# that are actually present (hard-coded labels fail if a class is missing).
barplot(targetTR, main = "Target variable in train sample",
        ylab = "Count",
        names.arg = names(targetTR),
        col = c("grey", "orange"))

1. NAME_CONTRACT_TYPE

We can observe that the contract type is a text variable, so we need to transform it into a categorical data type.

# Distinct contract-type labels in the train sample, then in the test sample.
unique(app_train3[["NAME_CONTRACT_TYPE"]])
## [1] "Cash loans"      "Revolving loans"
unique(app_test3[["NAME_CONTRACT_TYPE"]])
## [1] "Cash loans"      "Revolving loans"


After transformation:

Cash loans = 0 and Revolving loans = 1

## [1] 0 1
## [1] TRUE
## [1] TRUE
## 
##      0      1 
## 278232  29279

## 
##     0     1 
## 48305   439




2. CODE_GENDER

We can observe that gender is a text variable, so we need to transform it into a binary data type.

# Distinct gender labels per sample. The train sample carries an extra "XNA"
# label that does not occur in the test sample.
unique(app_train3[["CODE_GENDER"]])
## [1] "M"   "F"   "XNA"
unique(app_test3[["CODE_GENDER"]])
## [1] "F" "M"


After transformation:

## [1] 0 1
## [1] 1 0
## [1] TRUE
## [1] TRUE




3. FLAG_OWN_CAR

We can observe that car ownership is a text variable, so we need to transform it into a binary data type.

# Distinct car-ownership labels in the train sample, then in the test sample.
unique(app_train3[["FLAG_OWN_CAR"]])
## [1] "N" "Y"
unique(app_test3[["FLAG_OWN_CAR"]])
## [1] "N" "Y"


After transformation:

## [1] 0 1
## [1] 0 1
## [1] TRUE
## [1] TRUE




4. FLAG_OWN_REALTY

We can observe that ownership of a flat or house is a text variable, so we need to transform it into a binary data type.

# Distinct realty-ownership labels in the train sample, then in the test
# sample.
unique(app_train3[["FLAG_OWN_REALTY"]])
## [1] "Y" "N"
unique(app_test3[["FLAG_OWN_REALTY"]])
## [1] "Y" "N"


After transformation:

## [1] 1 0
## [1] 1 0
## [1] TRUE
## [1] TRUE




5. CNT_CHILDREN


# Distinct child counts in the train sample, then in the test sample.
unique(app_train3[["CNT_CHILDREN"]])
##  [1]  0  1  2  3  4  7  5  6  8  9 11 12 10 19 14
unique(app_test3[["CNT_CHILDREN"]])
##  [1]  0  2  1  3  8  4  6  5  7 20 11




In this case, we will treat families with 4 or more children as large families, so we need to cap the data.

Transform the variable to binary.

At this stage, our children variable is balanced for the train sample and imbalanced for the test sample.




6. AMT_INCOME_TOTAL

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   26942  112500  157500  178432  225000 4410000
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     25650    112500    147150    168798    202500 117000000



We can observe that the outliers occur only on the right side, so we use winsorization.

7. AMT_CREDIT

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   45000  260640  450000  516740  675000 2245500
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   45000  270000  513531  599026  808650 4050000



Applying logarithmic transformation.

# Replace the credit amount by its natural logarithm in both samples.
app_train3$AMT_CREDIT <- log(app_train3$AMT_CREDIT)
app_test3$AMT_CREDIT <- log(app_test3$AMT_CREDIT)

# Histograms of the log-transformed credit amount (plot titles: typo
# "logarithimic" corrected).
hist(app_test3$AMT_CREDIT, 
     main = "Distribution of CA in test sample after logarithmic transformation",
     xlab = "Credit Total")

hist(app_train3$AMT_CREDIT, 
     main = "Distribution of CA in train sample after logarithmic transformation",
     xlab = "Credit Total")

8. AMT_ANNUITY

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2295   17973   26199   29426   37390  180576      24
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1616   16524   24903   27109   34596  258026      12


We need to handle the NAs (by imputation) and the distribution (by applying a logarithmic transformation).

9. AMT_GOODS_PRICE

# Summary statistics of AMT_GOODS_PRICE
summary(app_test3$AMT_GOODS_PRICE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   45000  225000  396000  462619  630000 2245500
summary(app_train3$AMT_GOODS_PRICE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   40500  238500  450000  538396  679500 4050000     278

# Random imputation: every missing value is replaced by a value drawn (with
# replacement) from the observed values of the SAME sample - train is filled
# from train, test from test.
# NOTE(review): no RNG seed is set in the visible part of the script, so the
# imputed values differ between runs.

# Observed (non-missing) values of each sample
no_missing_train <- app_train3$AMT_GOODS_PRICE[!is.na(app_train3$AMT_GOODS_PRICE)]
no_missing_test <- app_test3$AMT_GOODS_PRICE[!is.na(app_test3$AMT_GOODS_PRICE)]

# One random draw per missing entry. Indexing through sample.int() avoids the
# sample() pitfall where a single observed number x is interpreted as 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$AMT_GOODS_PRICE)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$AMT_GOODS_PRICE)),
  replace = TRUE
)]

# Replace missing values with the drawn values in the train dataset
app_train3$AMT_GOODS_PRICE[is.na(app_train3$AMT_GOODS_PRICE)] <- random_values_train

# Replace missing values with the drawn values in the test dataset
app_test3$AMT_GOODS_PRICE[is.na(app_test3$AMT_GOODS_PRICE)] <- random_values_test

# Plot the distribution of AMT_GOODS_PRICE
hist(app_test3$AMT_GOODS_PRICE,
     main = "Distribution of Goods Price in Test Sample",
     xlab = "Goods Price")

hist(app_train3$AMT_GOODS_PRICE,
     main = "Distribution of Goods Price in Train Sample",
     xlab = "Goods Price")

# Boxplots
boxplot(app_test3$AMT_GOODS_PRICE,
        main = "Boxplot of Goods Price in Test Sample",
        ylab = "Goods Price")

boxplot(app_train3$AMT_GOODS_PRICE,
        main = "Boxplot of Goods Price in Train Sample",
        ylab = "Goods Price")


Apply the logarithmic transformation.

# Replace the goods price by its natural logarithm in both samples.
app_test3[["AMT_GOODS_PRICE"]] <- log(app_test3[["AMT_GOODS_PRICE"]])
app_train3[["AMT_GOODS_PRICE"]] <- log(app_train3[["AMT_GOODS_PRICE"]])

# Histograms of the transformed variable, test sample first.
hist(
  app_test3[["AMT_GOODS_PRICE"]],
  xlab = "Goods Price",
  main = "Distribution of Goods Price in Test Sample after transformation"
)

hist(
  app_train3[["AMT_GOODS_PRICE"]],
  xlab = "Goods Price",
  main = "Distribution of Goods Price in Train Sample after transformation"
)



10. NAME_TYPE_SUITE

# Summary statistics of NAME_TYPE_SUITE
summary(app_test3$NAME_TYPE_SUITE)
##    Length     Class      Mode 
##     48744 character character
summary(app_train3$NAME_TYPE_SUITE)
##    Length     Class      Mode 
##    307511 character character
# NAME_TYPE_SUITE is a text variable, so it is recoded to integer category
# codes below.
unique(app_train3$NAME_TYPE_SUITE)
## [1] "Unaccompanied"   "Family"          "Spouse, partner" "Children"       
## [5] "Other_A"         NA                "Other_B"         "Group of people"
unique(app_test3$NAME_TYPE_SUITE)
## [1] "Unaccompanied"   NA                "Family"          "Spouse, partner"
## [5] "Group of people" "Other_B"         "Children"        "Other_A"
# Category -> integer code. Only the ORDER of the names is used below (it
# defines the factor levels); the codes 1..7 agree with the level indices
# because they are listed in that same order.
category_mapping <- c(
  "Unaccompanied" = 1, "Family" = 2, "Spouse, partner" = 3,
  "Group of people" = 4, "Other_B" = 5, "Children" = 6, "Other_A" = 7
)

# Convert the column to a factor whose levels follow the mapping order
app_train3$NAME_TYPE_SUITE <- factor(app_train3$NAME_TYPE_SUITE, levels = names(category_mapping))
app_test3$NAME_TYPE_SUITE <- factor(app_test3$NAME_TYPE_SUITE, levels = names(category_mapping))

# as.integer() on a factor returns the level index, i.e. the mapped code
app_train3$NAME_TYPE_SUITE <- as.integer(app_train3$NAME_TYPE_SUITE)
app_test3$NAME_TYPE_SUITE <- as.integer(app_test3$NAME_TYPE_SUITE)

# Verify the changes
unique(app_test3$NAME_TYPE_SUITE)
## [1]  1 NA  2  3  4  5  6  7
# The NA above shows that the missing values still have to be handled.
# Random imputation: each NA is replaced by a code drawn (with replacement)
# from the observed codes of the SAME sample (train from train, test from test).

# Observed (non-missing) codes of each sample
no_missing_train <- app_train3$NAME_TYPE_SUITE[!is.na(app_train3$NAME_TYPE_SUITE)]
no_missing_test <- app_test3$NAME_TYPE_SUITE[!is.na(app_test3$NAME_TYPE_SUITE)]

# One random draw per missing entry. Indexing through sample.int() avoids the
# sample() pitfall where a single observed number x is interpreted as 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$NAME_TYPE_SUITE)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$NAME_TYPE_SUITE)),
  replace = TRUE
)]

# Replace missing values with the drawn values in the train dataset
app_train3$NAME_TYPE_SUITE[is.na(app_train3$NAME_TYPE_SUITE)] <- random_values_train

# Replace missing values with the drawn values in the test dataset
app_test3$NAME_TYPE_SUITE[is.na(app_test3$NAME_TYPE_SUITE)] <- random_values_test

# Plot the distribution of NAME_TYPE_SUITE
hist(app_test3$NAME_TYPE_SUITE, 
     main = "Distribution of type suite in test sample",
     xlab = "Type suite")

hist(app_train3$NAME_TYPE_SUITE, 
     main = "Distribution of type suite in train sample",
     xlab = "Type suite")


After checking the distribution, we decide to recode NAME_TYPE_SUITE as a binary variable: Unaccompanied is 0 and any other category is 1.

# Collapse the category codes to a 0/1 flag: code 1 becomes 0, every other
# code becomes 1.
app_test3[["NAME_TYPE_SUITE"]] <- ifelse(
  app_test3[["NAME_TYPE_SUITE"]] == "1", 0, 1
)
app_train3[["NAME_TYPE_SUITE"]] <- ifelse(
  app_train3[["NAME_TYPE_SUITE"]] == "1", 0, 1
)

# Histograms of the resulting flag, test sample first.
hist(
  app_test3[["NAME_TYPE_SUITE"]],
  xlab = "Type suite",
  main = "Distribution of type suite in test sample"
)

hist(
  app_train3[["NAME_TYPE_SUITE"]],
  xlab = "Type suite",
  main = "Distribution of type suite in train sample"
)


11. NAME_EDUCATION_TYPE

# Summary statistics of NAME_EDUCATION_TYPE
summary(app_test3$NAME_EDUCATION_TYPE)
##    Length     Class      Mode 
##     48744 character character
summary(app_train3$NAME_EDUCATION_TYPE)
##    Length     Class      Mode 
##    307511 character character
# NAME_EDUCATION_TYPE is a text variable, so it is recoded to integer
# category codes below.
unique(app_train3$NAME_EDUCATION_TYPE)
## [1] "Secondary / secondary special" "Higher education"             
## [3] "Incomplete higher"             "Lower secondary"              
## [5] "Academic degree"
unique(app_test3$NAME_EDUCATION_TYPE)
## [1] "Higher education"              "Secondary / secondary special"
## [3] "Incomplete higher"             "Lower secondary"              
## [5] "Academic degree"
# Category -> integer code, ordered from lowest to highest education level.
# Only the ORDER of the names is used below (it defines the factor levels).
category_mapping <- c(
  "Lower secondary" = 1, "Secondary / secondary special" = 2,
  "Incomplete higher" = 3, "Higher education" = 4, "Academic degree" = 5
)

# Convert the column to a factor whose levels follow the mapping order
app_train3$NAME_EDUCATION_TYPE <- factor(app_train3$NAME_EDUCATION_TYPE, levels = names(category_mapping))
app_test3$NAME_EDUCATION_TYPE <- factor(app_test3$NAME_EDUCATION_TYPE, levels = names(category_mapping))

# as.integer() on a factor returns the level index, i.e. the mapped code
app_train3$NAME_EDUCATION_TYPE <- as.integer(app_train3$NAME_EDUCATION_TYPE)
app_test3$NAME_EDUCATION_TYPE <- as.integer(app_test3$NAME_EDUCATION_TYPE)

# Verify the changes
unique(app_test3$NAME_EDUCATION_TYPE)
## [1] 4 2 3 1 5
# The outputs above show no NA and no label outside the mapping, so the
# imputation below is expected to change nothing; it only acts as a safeguard.
# If NAs were present, each would be replaced by a code drawn (with
# replacement) from the observed codes of the SAME sample.

# Observed (non-missing) codes of each sample
no_missing_train <- app_train3$NAME_EDUCATION_TYPE[!is.na(app_train3$NAME_EDUCATION_TYPE)]
no_missing_test <- app_test3$NAME_EDUCATION_TYPE[!is.na(app_test3$NAME_EDUCATION_TYPE)]

# One random draw per missing entry. Indexing through sample.int() avoids the
# sample() pitfall where a single observed number x is interpreted as 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$NAME_EDUCATION_TYPE)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$NAME_EDUCATION_TYPE)),
  replace = TRUE
)]

# Replace missing values with the drawn values in the train dataset
app_train3$NAME_EDUCATION_TYPE[is.na(app_train3$NAME_EDUCATION_TYPE)] <- random_values_train

# Replace missing values with the drawn values in the test dataset
app_test3$NAME_EDUCATION_TYPE[is.na(app_test3$NAME_EDUCATION_TYPE)] <- random_values_test

# Plot the distribution of NAME_EDUCATION_TYPE
hist(app_test3$NAME_EDUCATION_TYPE, 
     main = "Distribution of education type in test sample",
     xlab = "education type")

hist(app_train3$NAME_EDUCATION_TYPE, 
     main = "Distribution of education type in train sample",
     xlab = "education type")

After checking the distribution, we decide to recode NAME_EDUCATION_TYPE as a binary variable: Lower secondary and Secondary / secondary special are 0; Incomplete higher, Higher education and Academic degree are 1.

# Collapse the education codes to a 0/1 flag: codes 1 and 2 become 0, every
# other code (3, 4, 5) becomes 1.
app_test3[["NAME_EDUCATION_TYPE"]] <- ifelse(
  app_test3[["NAME_EDUCATION_TYPE"]] == "1" |
    app_test3[["NAME_EDUCATION_TYPE"]] == "2",
  0, 1
)
app_train3[["NAME_EDUCATION_TYPE"]] <- ifelse(
  app_train3[["NAME_EDUCATION_TYPE"]] == "1" |
    app_train3[["NAME_EDUCATION_TYPE"]] == "2",
  0, 1
)

# Histograms of the resulting flag, test sample first.
hist(
  app_test3[["NAME_EDUCATION_TYPE"]],
  xlab = "Educ",
  main = "Distribution of education in test sample"
)

hist(
  app_train3[["NAME_EDUCATION_TYPE"]],
  xlab = "Educ",
  main = "Distribution of education in train sample"
)



12. NAME_FAMILY_STATUS

# Summary statistics of NAME_FAMILY_STATUS
summary(app_test3$NAME_FAMILY_STATUS)
##    Length     Class      Mode 
##     48744 character character
summary(app_train3$NAME_FAMILY_STATUS)
##    Length     Class      Mode 
##    307511 character character
# NAME_FAMILY_STATUS is a text variable, so it is recoded to integer category
# codes below.
unique(app_train3$NAME_FAMILY_STATUS)
## [1] "Single / not married" "Married"              "Civil marriage"      
## [4] "Widow"                "Separated"            "Unknown"
unique(app_test3$NAME_FAMILY_STATUS)
## [1] "Married"              "Single / not married" "Civil marriage"      
## [4] "Widow"                "Separated"
# Category -> integer code. Only the ORDER of the names is used below (it
# defines the factor levels). "Unknown" (train only) is deliberately absent,
# so factor() turns it into NA and it is imputed further down.
category_mapping <- c(
  "Single / not married" = 1, "Civil marriage" = 2, "Married" = 3,
  "Separated" = 4, "Widow" = 5
)

# Convert the column to a factor whose levels follow the mapping order
app_train3$NAME_FAMILY_STATUS <- factor(app_train3$NAME_FAMILY_STATUS, levels = names(category_mapping))
app_test3$NAME_FAMILY_STATUS <- factor(app_test3$NAME_FAMILY_STATUS, levels = names(category_mapping))

# as.integer() on a factor returns the level index, i.e. the mapped code
app_train3$NAME_FAMILY_STATUS <- as.integer(app_train3$NAME_FAMILY_STATUS)
app_test3$NAME_FAMILY_STATUS <- as.integer(app_test3$NAME_FAMILY_STATUS)

# Verify the changes
unique(app_test3$NAME_FAMILY_STATUS)
## [1] 3 1 2 5 4
# The test sample has no NA; in the train sample the former "Unknown" rows
# are NA now. Random imputation: each NA is replaced by a code drawn (with
# replacement) from the observed codes of the SAME sample.

# Observed (non-missing) codes of each sample
no_missing_train <- app_train3$NAME_FAMILY_STATUS[!is.na(app_train3$NAME_FAMILY_STATUS)]
no_missing_test <- app_test3$NAME_FAMILY_STATUS[!is.na(app_test3$NAME_FAMILY_STATUS)]

# One random draw per missing entry. Indexing through sample.int() avoids the
# sample() pitfall where a single observed number x is interpreted as 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$NAME_FAMILY_STATUS)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$NAME_FAMILY_STATUS)),
  replace = TRUE
)]

# Replace missing values with the drawn values in the train dataset
app_train3$NAME_FAMILY_STATUS[is.na(app_train3$NAME_FAMILY_STATUS)] <- random_values_train

# Replace missing values with the drawn values in the test dataset
app_test3$NAME_FAMILY_STATUS[is.na(app_test3$NAME_FAMILY_STATUS)] <- random_values_test

# Plot the distribution of NAME_FAMILY_STATUS
hist(app_test3$NAME_FAMILY_STATUS, 
     main = "Distribution of family status in test sample",
     xlab = "family status")

hist(app_train3$NAME_FAMILY_STATUS, 
     main = "Distribution of family status in train sample",
     xlab = "family status")



13. DAYS_EMPLOYED

# DAYS_EMPLOYED: descriptive statistics, test sample first.
# NOTE(review): the quartiles are negative while the maximum is 365243
# (roughly 1000 years) in both samples - this looks like a placeholder value;
# confirm how it should be treated before modelling.
summary(app_test3[["DAYS_EMPLOYED"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -17463   -2910   -1293   67485    -296  365243
summary(app_train3[["DAYS_EMPLOYED"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -17912   -2760   -1213   63815    -289  365243
# Number of missing values per sample.
sum(is.na(app_test3[["DAYS_EMPLOYED"]]))
## [1] 0
sum(is.na(app_train3[["DAYS_EMPLOYED"]]))
## [1] 0
# Histograms of DAYS_EMPLOYED, test sample first.
hist(
  app_test3[["DAYS_EMPLOYED"]],
  xlab = "employed days",
  main = "Distribution of employed days in test sample"
)

hist(
  app_train3[["DAYS_EMPLOYED"]],
  xlab = "employed days",
  main = "Distribution of employed days in train sample"
)



14. REGION_RATING_CLIENT

# REGION_RATING_CLIENT: descriptive statistics, test sample first.
summary(app_test3[["REGION_RATING_CLIENT"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   2.000   2.038   2.000   3.000
summary(app_train3[["REGION_RATING_CLIENT"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   2.000   2.052   2.000   3.000
# Number of missing values per sample.
sum(is.na(app_test3[["REGION_RATING_CLIENT"]]))
## [1] 0
sum(is.na(app_train3[["REGION_RATING_CLIENT"]]))
## [1] 0
# Histograms of REGION_RATING_CLIENT, test sample first.
hist(
  app_test3[["REGION_RATING_CLIENT"]],
  xlab = "region rating client",
  main = "Distribution of region rating client in test sample"
)

hist(
  app_train3[["REGION_RATING_CLIENT"]],
  xlab = "region rating client",
  main = "Distribution of region rating client in train sample"
)



15. DOCUMENT

# DOCUMENT: descriptive statistics, test sample first.
summary(app_test3[["DOCUMENT"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.05000 0.05000 0.04923 0.05000 0.05000
summary(app_train3[["DOCUMENT"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.05000 0.05000 0.04651 0.05000 0.20000
# Number of missing values per sample.
sum(is.na(app_test3[["DOCUMENT"]]))
## [1] 0
sum(is.na(app_train3[["DOCUMENT"]]))
## [1] 0
# Collapse DOCUMENT to a 0/1 flag: 1 when the value is positive, else 0.
app_test3[["DOCUMENT"]] <- ifelse(app_test3[["DOCUMENT"]] > 0, 1, 0)
app_train3[["DOCUMENT"]] <- ifelse(app_train3[["DOCUMENT"]] > 0, 1, 0)

# Histograms of the resulting flag, test sample first.
hist(
  app_test3[["DOCUMENT"]],
  xlab = "DOCUMENT",
  main = "Distribution of DOCUMENT in test sample"
)

hist(
  app_train3[["DOCUMENT"]],
  xlab = "DOCUMENT",
  main = "Distribution of DOCUMENT in train sample"
)



16. EXT_SOURCE_1

# Summary statistics of EXT_SOURCE_1
summary(app_test3$EXT_SOURCE_1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.013   0.344   0.507   0.501   0.666   0.939   20532
summary(app_train3$EXT_SOURCE_1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.01    0.33    0.51    0.50    0.68    0.96  173378
sum(is.na(app_test3$EXT_SOURCE_1))
## [1] 20532
sum(is.na(app_train3$EXT_SOURCE_1))
## [1] 173378
# Random imputation: every missing value is replaced by a value drawn (with
# replacement) from the observed values of the SAME sample (train from train,
# test from test).
# NOTE(review): more than half of the train values are imputed here
# (173378 missing), and no RNG seed is set in the visible part of the script,
# so results differ between runs.

# Observed (non-missing) values of each sample
no_missing_train <- app_train3$EXT_SOURCE_1[!is.na(app_train3$EXT_SOURCE_1)]
no_missing_test <- app_test3$EXT_SOURCE_1[!is.na(app_test3$EXT_SOURCE_1)]

# One random draw per missing entry. Indexing through sample.int() avoids the
# sample() pitfall where a single observed number x >= 1 is read as 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$EXT_SOURCE_1)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$EXT_SOURCE_1)),
  replace = TRUE
)]

# Replace missing values with the drawn values in the train dataset
app_train3$EXT_SOURCE_1[is.na(app_train3$EXT_SOURCE_1)] <- random_values_train

# Replace missing values with the drawn values in the test dataset
app_test3$EXT_SOURCE_1[is.na(app_test3$EXT_SOURCE_1)] <- random_values_test

# Plot the distribution of EXT_SOURCE_1
hist(app_test3$EXT_SOURCE_1, 
     main = "Distribution of EXT_SOURCE_1 in test sample",
     xlab = "EXT_SOURCE_1")

hist(app_train3$EXT_SOURCE_1, 
     main = "Distribution of EXT_SOURCE_1 in train sample",
     xlab = "EXT_SOURCE_1")



17. EXT_SOURCE_2

# Summary statistics of EXT_SOURCE_2
summary(app_test3$EXT_SOURCE_2)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## 0.000008 0.408066 0.558758 0.518021 0.658497 0.855000        8
summary(app_train3$EXT_SOURCE_2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.3925  0.5660  0.5144  0.6636  0.8550     660
sum(is.na(app_test3$EXT_SOURCE_2))
## [1] 8
sum(is.na(app_train3$EXT_SOURCE_2))
## [1] 660
# Random imputation: every missing value is replaced by a value drawn (with
# replacement) from the observed values of the SAME sample (train from train,
# test from test).
# NOTE(review): no RNG seed is set in the visible part of the script, so the
# imputed values differ between runs.

# Observed (non-missing) values of each sample
no_missing_train <- app_train3$EXT_SOURCE_2[!is.na(app_train3$EXT_SOURCE_2)]
no_missing_test <- app_test3$EXT_SOURCE_2[!is.na(app_test3$EXT_SOURCE_2)]

# One random draw per missing entry. Indexing through sample.int() avoids the
# sample() pitfall where a single observed number x >= 1 is read as 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$EXT_SOURCE_2)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$EXT_SOURCE_2)),
  replace = TRUE
)]

# Replace missing values with the drawn values in the train dataset
app_train3$EXT_SOURCE_2[is.na(app_train3$EXT_SOURCE_2)] <- random_values_train

# Replace missing values with the drawn values in the test dataset
app_test3$EXT_SOURCE_2[is.na(app_test3$EXT_SOURCE_2)] <- random_values_test

# Plot the distribution of EXT_SOURCE_2
hist(app_test3$EXT_SOURCE_2, 
     main = "Distribution of EXT_SOURCE_2 in test sample",
     xlab = "EXT_SOURCE_2")

hist(app_train3$EXT_SOURCE_2, 
     main = "Distribution of EXT_SOURCE_2 in train sample",
     xlab = "EXT_SOURCE_2")



18. EXT_SOURCE_3

# Summary statistics of EXT_SOURCE_3
summary(app_test3$EXT_SOURCE_3)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.001   0.364   0.519   0.500   0.653   0.883    8668
summary(app_train3$EXT_SOURCE_3)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.37    0.54    0.51    0.67    0.90   60965
sum(is.na(app_test3$EXT_SOURCE_3))
## [1] 8668
sum(is.na(app_train3$EXT_SOURCE_3))
## [1] 60965
# Random imputation: every missing value is replaced by a value drawn (with
# replacement) from the observed values of the SAME sample (train from train,
# test from test).
# NOTE(review): no RNG seed is set in the visible part of the script, so the
# imputed values differ between runs.

# Observed (non-missing) values of each sample
no_missing_train <- app_train3$EXT_SOURCE_3[!is.na(app_train3$EXT_SOURCE_3)]
no_missing_test <- app_test3$EXT_SOURCE_3[!is.na(app_test3$EXT_SOURCE_3)]

# One random draw per missing entry. Indexing through sample.int() avoids the
# sample() pitfall where a single observed number x >= 1 is read as 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$EXT_SOURCE_3)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$EXT_SOURCE_3)),
  replace = TRUE
)]

# Replace missing values with the drawn values in the train dataset
app_train3$EXT_SOURCE_3[is.na(app_train3$EXT_SOURCE_3)] <- random_values_train

# Replace missing values with the drawn values in the test dataset
app_test3$EXT_SOURCE_3[is.na(app_test3$EXT_SOURCE_3)] <- random_values_test

# Plot the distribution of EXT_SOURCE_3
hist(app_test3$EXT_SOURCE_3, 
     main = "Distribution of EXT_SOURCE_3 in test sample",
     xlab = "EXT_SOURCE_3")

hist(app_train3$EXT_SOURCE_3, 
     main = "Distribution of EXT_SOURCE_3 in train sample",
     xlab = "EXT_SOURCE_3")



19. AMT_REQ_CREDIT_BUREAU_QRT

# Summary statistics of AMT_REQ_CREDIT_BUREAU_QRT
summary(app_test3$AMT_REQ_CREDIT_BUREAU_QRT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   0.547   1.000   7.000    6049
summary(app_train3$AMT_REQ_CREDIT_BUREAU_QRT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00    0.00    0.27    0.00  261.00   41519
sum(is.na(app_test3$AMT_REQ_CREDIT_BUREAU_QRT))
## [1] 6049
sum(is.na(app_train3$AMT_REQ_CREDIT_BUREAU_QRT))
## [1] 41519
# Random imputation: every missing value is replaced by a value drawn (with
# replacement) from the observed values of the SAME sample (train from train,
# test from test).
# NOTE(review): no RNG seed is set in the visible part of the script, so the
# imputed values differ between runs.

# Observed (non-missing) values of each sample
no_missing_train <- app_train3$AMT_REQ_CREDIT_BUREAU_QRT[!is.na(app_train3$AMT_REQ_CREDIT_BUREAU_QRT)]
no_missing_test <- app_test3$AMT_REQ_CREDIT_BUREAU_QRT[!is.na(app_test3$AMT_REQ_CREDIT_BUREAU_QRT)]

# One random draw per missing entry. Indexing through sample.int() avoids the
# sample() pitfall where a single observed number x >= 1 is read as 1:x.
random_values_train <- no_missing_train[sample.int(
  length(no_missing_train),
  sum(is.na(app_train3$AMT_REQ_CREDIT_BUREAU_QRT)),
  replace = TRUE
)]
random_values_test <- no_missing_test[sample.int(
  length(no_missing_test),
  sum(is.na(app_test3$AMT_REQ_CREDIT_BUREAU_QRT)),
  replace = TRUE
)]

# Replace missing values with the drawn values in the train dataset
app_train3$AMT_REQ_CREDIT_BUREAU_QRT[is.na(app_train3$AMT_REQ_CREDIT_BUREAU_QRT)] <- random_values_train

# Replace missing values with the drawn values in the test dataset
app_test3$AMT_REQ_CREDIT_BUREAU_QRT[is.na(app_test3$AMT_REQ_CREDIT_BUREAU_QRT)] <- random_values_test

# Plot the distribution of AMT_REQ_CREDIT_BUREAU_QRT
hist(app_test3$AMT_REQ_CREDIT_BUREAU_QRT, 
     main = "Distribution of AMT_REQ_CREDIT_BUREAU_QRT in test sample",
     xlab = "AMT_REQ_CREDIT_BUREAU_QRT")

hist(app_train3$AMT_REQ_CREDIT_BUREAU_QRT, 
     main = "Distribution of AMT_REQ_CREDIT_BUREAU_QRT in train sample",
     xlab = "AMT_REQ_CREDIT_BUREAU_QRT")



Change it to a binary variable: 1 if there was at least one request, 0 otherwise.

# Collapse the request count to a 0/1 flag: 1 when the count is positive,
# else 0.
app_test3[["AMT_REQ_CREDIT_BUREAU_QRT"]] <- ifelse(
  app_test3[["AMT_REQ_CREDIT_BUREAU_QRT"]] > 0, 1, 0
)
app_train3[["AMT_REQ_CREDIT_BUREAU_QRT"]] <- ifelse(
  app_train3[["AMT_REQ_CREDIT_BUREAU_QRT"]] > 0, 1, 0
)

# Histograms of the resulting flag, test sample first.
hist(
  app_test3[["AMT_REQ_CREDIT_BUREAU_QRT"]],
  xlab = "AMT_REQ_CREDIT_BUREAU_QRT",
  main = "Distribution of AMT_REQ_CREDIT_BUREAU_QRT in test sample"
)

hist(
  app_train3[["AMT_REQ_CREDIT_BUREAU_QRT"]],
  xlab = "AMT_REQ_CREDIT_BUREAU_QRT",
  main = "Distribution of AMT_REQ_CREDIT_BUREAU_QRT in train sample"
)

# Inspect the column types and leading values of the train sample at this
# point of the script.
str(app_train3)
## 'data.frame':    307511 obs. of  21 variables:
##  $ SK_ID_CURR               : num  1e+05 1e+05 1e+05 1e+05 1e+05 ...
##  $ TARGET                   : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ NAME_CONTRACT_TYPE       : int  0 0 1 0 0 0 0 0 0 1 ...
##  $ CODE_GENDER              : num  0 1 0 1 0 0 1 0 1 0 ...
##  $ FLAG_OWN_CAR             : num  0 0 1 0 0 0 1 1 0 0 ...
##  $ FLAG_OWN_REALTY          : num  1 0 1 1 1 1 1 1 1 1 ...
##  $ CNT_CHILDREN             : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ AMT_INCOME_TOTAL         : num  202500 270000 67500 135000 121500 ...
##  $ AMT_CREDIT               : num  12.9 14.1 11.8 12.7 13.1 ...
##  $ AMT_ANNUITY              : num  10.11 10.48 8.82 10.3 9.99 ...
##  $ AMT_GOODS_PRICE          : num  12.8 13.9 11.8 12.6 13.1 ...
##  $ NAME_TYPE_SUITE          : num  0 1 0 0 0 1 0 0 1 0 ...
##  $ NAME_EDUCATION_TYPE      : num  0 1 0 0 0 0 1 1 0 0 ...
##  $ NAME_FAMILY_STATUS       : int  1 3 1 2 1 3 3 3 3 1 ...
##  $ DAYS_EMPLOYED            : num  -637 -1188 -225 -3039 -3038 ...
##  $ REGION_RATING_CLIENT     : num  2 1 2 2 2 2 2 3 2 2 ...
##  $ DOCUMENT                 : num  1 1 0 1 1 1 1 1 1 0 ...
##  $ EXT_SOURCE_1             : num  0.083 0.311 0.659 0.34 0.774 ...
##  $ EXT_SOURCE_2             : num  0.263 0.622 0.556 0.65 0.323 ...
##  $ EXT_SOURCE_3             : num  0.139 0.202 0.73 0.774 0.569 ...
##  $ AMT_REQ_CREDIT_BUREAU_QRT: num  0 0 0 0 0 1 1 0 0 0 ...
# Load the corrplot package (the plot below uses stats::heatmap(), which is
# part of base R; corrplot itself is not called here).
library(corrplot)

# Pairwise Pearson correlations between all columns of the train sample.
# cor() requires every column to be numeric.
cor_matrix <- cor(app_train3)

# Draw the heatmap.
# - heatmap() has no `cmap` argument; colours are passed through `col` as a
#   vector, so the palette function is evaluated for 100 shades.
# - symm = TRUE treats the correlation matrix as symmetric, which also turns
#   off the default row-wise scaling so that colours reflect the correlations.
heatmap(cor_matrix,
        col = colorRampPalette(c("blue", "white", "red"))(100),
        symm = TRUE,
        main = "Correlation Matrix")

# Snapshot the processed samples under the names used for the saved files.
app_train4 <- app_train3
app_test4 <- app_test3

# Karolina's CSV export (disabled)
# file_path <- "C:\\Users\\karla\\OneDrive\\Pulpit\\Credit Risk\\Projekt\\app_train4.csv"

# Save the dataframe as a CSV file
# write_csv(app_train4, file_path)
# file_path <- "C:\\Users\\karla\\OneDrive\\Pulpit\\Credit Risk\\Projekt\\app_test4.csv"
# write_csv(app_test4, file_path)

# Write each snapshot to its own .RData file (paths are relative to the
# working directory).
save(list = "app_train4", file = "dane/app_train4.RData")
save(list = "app_test4", file = "dane/app_test4.RData")